# =============================================================================
# =============================================================================
# Quick script for some descriptive tables of Clarity data. This is the basis for Appendix Table A2
# =============================================================================
# =============================================================================

# Import the Clarity Data (Full Sample)
cif = pd.read_parquet(cdd['p_d_cl_c'] + r'\cif_q.parquet')
ctf = pd.read_parquet(cdd['p_d_cl_c'] + r'\ctf_q.parquet')


#Adjust some variables
cif['inq_payday'] = cif[['inq_C1','inq_C2']].sum(axis=1)
cif['inq_inst'] = cif[['inq_C3','inq_C6']].sum(axis=1)
cif['inq_r2own'] = cif[['inq_C5','inq_C13']].sum(axis=1)
cif['inq_payday_gt0'] = 1*(cif['inq_payday'] > 0)
cif['inq_inst_gt0'] = 1*(cif['inq_inst'] > 0)
cif['inq_payday_gt1'] = 1*(cif['inq_payday'] > 1)
cif['inq_inst_gt1'] = 1*(cif['inq_inst'] > 1)


chars_i = ['age','income']
inq_i = ['inq_payday','inq_payday_gt0','inq_payday_gt1','inq_inst','inq_inst_gt0','inq_inst_gt1']
trans_t = ['ntrans_d','ch_d']


ccf = pd.merge(cif,ctf[['id','date'] + trans_t].rename(columns={'date':'dateq'}), how='left',on=['id','dateq'])
ccf['tdata'] = 1*(ccf.groupby('id')['ntrans_d'].transform('max')>=0)
for v in trans_t:
    ccf[v] = np.where(ccf['tdata']==1,ccf[v].fillna(0),ccf[v])

del cif, ctf 
gc.collect()

varlab = {'age':'Age','income':'Income',
          'inq_payday':'Payday',
          'inq_payday_gt0':'Payday Any',
          'inq_payday_gt1':'Payday (gt1)',
          'inq_inst':'Installment',
          'inq_inst_gt0':'Installment Any',
          'inq_inst_gt1':'Installment (gt1)',
          'ntrans_d':'Transactions (qe)',
          'ch_d':'Highest Credit (qe)'}
inq_i = ['inq_payday_gt0','inq_payday_gt1','inq_inst_gt0','inq_inst_gt1']
ss = ccf[chars_i + inq_i + trans_t].describe(percentiles=[0.25,0.5,0.75,0.9,0.95,0.99]).T.reset_index()
ss['index'] = ss['index'].map(varlab)

colrn={'index':'Variables','count':'N','mean':'Mean','std':'Std. Dev.'}

def int2str(x):
    return str(int(x))
def float2str(x,sigdig=3):
    return str(np.round(x,sigdig))
form = {**{'Mean':float2str,'Std. Dev.':float2str,'N':int2str}, **{v:int2str for v in ldiff(list(ss),['mean','std'])}}
# ss.rename(columns=colrn).to_excel(cdd['p_rt'] + r'\Clarity\Clarity_SummaryStats_FullSample.xlsx')

a=cif.sample(1000)
list(ccf)



# Import the Clarity Analaysis Data
ccf = pd.merge(pd.read_parquet(cdd['p_d_cl_pa'] + r'\cifp_q.parquet'),
               pd.read_parquet(cdd['p_d_cl_pa'] + r'\ctfp_q.parquet'),how='inner',on=['id', 'date', 'open','cohort','zip','distance_t2cf','distance','distance_t2cf_rank','etime','treated','cohort_cf','open_cf','ic','ctype','pop2sqkm_wq10','state','housing','pay_freq','dor','income','dob','etimey',])

#Adjust some variables
ccf['age'] = (ccf['date'] - ccf['dob']).dt.total_seconds() / (3600*24*365/12)

ccf['inq_payday'] = ccf[['inq_C1','inq_C2']].sum(axis=1)
ccf['inq_inst'] = ccf[['inq_C3','inq_C6']].sum(axis=1)
ccf['inq_r2own'] = ccf[['inq_C5','inq_C13']].sum(axis=1)

ccf = ccf[(ccf.open >= dt.datetime(2014,7,1))&(ccf.open < dt.datetime(2020,1,1))]
ccf = ccf[(ccf.ic>5000)&((ccf.distance<5000))]
ccf['inq_payday_gt0'] = 1*(ccf['inq_payday'] > 0)
ccf['inq_inst_gt0'] = 1*(ccf['inq_inst'] > 0)
ccf['inq_payday_gt1'] = 1*(ccf['inq_payday'] > 1)
ccf['inq_inst_gt1'] = 1*(ccf['inq_inst'] > 1)

ss = ccf[chars_i + inq_i + trans_t].describe(percentiles=[0.25,0.5,0.75,0.9,0.95,0.99]).T.reset_index()
ss['index'] = ss['index'].map(varlab)

form = {**{'Mean':float2str,'Std. Dev.':float2str,'N':int2str}, **{v:int2str for v in ldiff(list(ss),['mean','std'])}}
# ss.rename(columns=colrn).to_excel(cdd['p_rt'] + r'\Clarity\Clarity_SummaryStats_FullSample.xlsx')





